#import python packages for data analysis
import pandas as pd, numpy as np
import seaborn as sns
import altair as alt
import matplotlib.pyplot as plt
%matplotlib inline
import pandas_profiling
#url='https://github.com/aarondov/voila-heroku/blob/master/notebooks/dflowtomid.xlsx'
#read source file into iPython Env.
dflm=pd.read_excel('dflowtomid.xlsx')
pd.options.display.float_format ='{:,.0f}'.format
#dflm['Dollar Threshold ID'] = dflm['Dollar Threshold ID'].astype('category')
dflm.shape
pandas_profiling.ProfileReport(dflm)
missingvalues_total = dflm.isnull().sum().sort_values(ascending=False)
missingvalues_total
Summarize the data with a pivot tables.
dflm.pivot_table(index=['Dollar Threshold ID','Dollar Thresholds'],values='revenues',margins=True,aggfunc=['sum','count'])
Piv_patienttype=dflm.pivot_table(index=['Patient Type'],values='revenues',margins=True,aggfunc=['sum','count'])
Piv_patienttype
import altair as alt
dflm=pd.read_excel('dflowtomid.xlsx')
alt.Chart(dflm).mark_bar().encode(
x=alt.X('Dollar Thresholds', title='Count of Dollar Thresholds', sort=alt.EncodingSortField(
field='Dollar Thresholds',
op='count',
order='descending')),
y='count(Dollar Thresholds)',
color='agegroup:N'
)
import altair as alt
alt.Chart(dflm).mark_bar().encode(
x=alt.X('Patient Type', title='Revenues by Patient Type', sort=alt.EncodingSortField(
field='revenues',
op='sum',
order='descending')),
y='sum(revenues)',
color='agegroup:N'
)
import altair as alt
dflm=pd.read_excel('dflowtomid.xlsx')
alt.Chart(dflm).mark_bar().encode(
x=alt.X('Patient Type', title='Revenues by Patient Type', sort=alt.EncodingSortField(
field='revenues',
op='count',
order='descending')),
y='count(revenues)',
color='agegroup:N'
)
Pivot data by aging categories.The majority of accounts are 151+.
Piv_agegroup=dflm.pivot_table(index=['agegroup'],values='revenues',margins=True,aggfunc=['sum','count','mean'])
Piv_agegroup
Create a pairs plot with seaborn.
#create pair plots to better understand the data
'''
This basic pair plots leverages seaborn's statistical graphing package.
'''
dflm=pd.read_excel('dflowtomid.xlsx')
sns.set(style="ticks", color_codes=True)
dflm=sns.pairplot(dflm)
#plt.show()
Leverage altair to create a scatter plot of revenues by age groups. The scatter plot clearly highlights that the majority of accounts are aged and under $20k.
#Leverage altair to create a scatter plot of revenues by account ageing categories.
# The scatter plot clearly highlights that the majority of accounts are aged and under $20k.
import altair as alt
dflm=pd.read_excel('dflowtomid.xlsx')
chart = alt.Chart(dflm).mark_circle().encode(
x='age',
y='revenues',
color='agegroup',
)
chart
The graph below is also a pair plot, but with a different graph package. These graphs are interactive. Use your cursor to move points on one of the graphs.
#The graph below is also a pair plot, but with a different graph package. These graphs are interactive.
#Use your cursor to move points on one of the graphs.
alt.Chart(dflm).mark_circle().encode(
alt.X(alt.repeat("column"), type='quantitative'),
alt.Y(alt.repeat("row"), type='quantitative'),
color='agegroup:N'
).properties(
width=150,
height=150
).repeat(
row=['revenues', 'age'],
column=['age', 'revenues']
).interactive()
stripplot = alt.Chart(dflm, width=80).mark_circle(size=10).encode(
x=alt.X(
'jitter:Q',
title=None,
axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
scale=alt.Scale(),
),
y=alt.Y('revenues:Q'),
color=alt.Color('agegroup:N', legend=None),
column=alt.Column(
'agegroup:N',
header=alt.Header(
labelAngle=-90,
titleOrient='top',
labelOrient='bottom',
labelAlign='right',
labelPadding=3,
),
),
).transform_calculate(
#Generate Gaussian jitter with a Box-Muller transform
jitter='sqrt(-2*log(random()))*cos(2*PI*random())'
).configure_facet(
spacing=0
).configure_view(
stroke=None
)
stripplot
dflm=pd.read_excel('dflowtomid.xlsx')
stripplot = alt.Chart(dflm, width=80).mark_circle(size=10).encode(
x=alt.X(
'jitter:Q',
title=None,
axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
scale=alt.Scale()
),
y=alt.Y('revenues:Q'),
color=alt.Color('Dollar Thresholds:N', legend=None),
column=alt.Column(
'Dollar Thresholds:N',sort=alt.EncodingSortField(
field='Dollar Threshold ID',
order='ascending'),
header=alt.Header(
labelAngle=-90,
titleOrient='top',
labelOrient='bottom',
labelAlign='right',
labelPadding=3,
),
),
).transform_calculate(
#Generate Gaussian jitter with a Box-Muller transform
jitter='sqrt(-2*log(random()))*cos(2*PI*random())'
).configure_facet(
spacing=0
).configure_view(
stroke=None
).properties(width=75)
stripplot
0-500 has the highest count\ 20k-25k= largest revenue segment
dflm.groupby(['Dollar Threshold ID','Dollar Thresholds'])['revenues'].agg(['sum','count']).sort_values(by=['sum','count'], ascending=False)
#Brushing Scatter Plot to show data on a table
#new df low to mid range dollars
dflm=pd.read_excel('dflowtomid.xlsx')
pd.options.display.float_format ='{:,.0f}'.format
import altair as alt
source = dflm
# Brush for selection
brush = alt.selection(type='interval')
# Scatter Plot
points = alt.Chart(source).mark_point().encode(
x='age:Q',
y='revenues:Q',
color=alt.condition(brush, 'Patient Type:N', alt.value('grey'))
).add_selection(brush)
# Base chart for data tables
ranked_text = alt.Chart(source).mark_text().encode(
y=alt.Y('row_number:O',axis=None)
).transform_window(
row_number='row_number()'
).transform_filter(
brush
).transform_window(
rank='rank(row_number)'
).transform_filter(
alt.datum.rank<20
)
# Data Tables
age = ranked_text.encode(text='age:N').properties(title='Acct Age')
revenues = ranked_text.encode(text='revenues:N').properties(title='Revenues')
payor = ranked_text.encode(text='payor:N').properties(title='Payor')
patient_type = ranked_text.encode(text='Patient Type:N').properties(title='Patiet Type')
text = alt.hconcat(age, revenues, payor, patient_type) # Combine data tables
# Build chart
alt.hconcat(
points,
text
).resolve_legend(
color="independent"
)
#new df low to mid range dollars
dflm=pd.read_excel('dflowtomid.xlsx')
pd.options.display.float_format ='{:,.0f}'.format
import altair as alt
source = dflm
alt.Chart(source).mark_circle().encode(
alt.X('age:Q', bin=True),
alt.Y('revenues:Q', bin=True),
size='count()'
)
#new df low to mid range dollars
dflm=pd.read_excel('dflowtomid.xlsx')
pd.options.display.float_format ='{:,.0f}'.format
import altair as alt
source = dflm
# Configure the options common to all layers
brush = alt.selection(type='interval')
base = alt.Chart(source).add_selection(brush)
# Configure the points
points = base.mark_point().encode(
x=alt.X('age', title=''),
y=alt.Y('revenues', title=''),
color=alt.condition(brush, 'Patient Type', alt.value('grey'))
).properties(height=500)
# Configure the ticks
tick_axis = alt.Axis(labels=False, domain=False, ticks=False)
x_ticks = base.mark_tick().encode(
alt.X('age', axis=tick_axis),
alt.Y('Patient Type', title='', axis=tick_axis),
color=alt.condition(brush, 'Patient Type', alt.value('lightgrey'))
)
y_ticks = base.mark_tick().encode(
alt.X('Patient Type', title='', axis=tick_axis),
alt.Y('revenues', axis=tick_axis),
color=alt.condition(brush, 'Patient Type', alt.value('lightgrey'))
).properties(height=500)
# Build the chart
y_ticks | (points & x_ticks)
import altair as alt
df_low=pd.read_excel('dflowtomid.xlsx')
source = dflm
alt.Chart(source).mark_point().encode(
x='age:Q',
y='revenues:Q',
row='Patient Type:N'
).properties(height=200)
import altair as alt
dflm=pd.read_excel('dflowtomid.xlsx')
pd.options.display.float_format ='{:,.0f}'.format
source = dflm
alt.Chart(source).mark_boxplot(size=40).encode(
y='revenues:Q',
x='agegroup:N',
#color='Patient Type:N'
).properties(
height=800,
width=300)
import altair as alt
df_low=pd.read_excel('dflowtomid.xlsx')
pd.options.display.float_format ='{:,.0f}'.format
source = dflm
alt.Chart(source).mark_boxplot(size=40).encode(
x='revenues:Q',
y='Patient Type:N',
color='agegroup',
).properties(
width=700,
height=200)
import matplotlib.pyplot as plt
%matplotlib inline
data=dflm
ax=sns.swarmplot(x="Patient Type", y="revenues", hue='agegroup', data=dflm, size=6)
ax.set_title('Swarm Plot: Patient Types by Age Group')
fig = plt.gcf()
fig.set_size_inches(12, 6)
dflm
dflm=dflm.to_csv('eda.csv')
#create pair plots to better understand the data
'''
This basic pair plots leverages seaborn's statistical graphing package.
'''
dflm=pd.read_csv('eda.csv')
dflm_noid=dflm.iloc[:,1:7]
sns.set(style="ticks", color_codes=True)
dflm=sns.pairplot(dflm_noid)
#plt.show()
dflm_noid.shape